#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,f1_score
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.preprocessing import binarize,minmax_scale,PolynomialFeatures
from sklearn import impute
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.tree import export_graphviz
from collections import Counter
from IPython.display import Image
from sklearn import tree
from os import system
# Load the bank marketing dataset; per the notes below it has 45211 rows x 17 columns.
bank_df=pd.read_csv('bank-full.csv',sep=',')
bank_df.head()
#we have 45211 rows and 17 columns.
bank_df.shape
Categorical variables: job, marital, education, default, housing, loan, contact, day, month, poutcome, Target
Quantitative variables: age, balance, duration, campaign, pdays, previous
# Dtypes and non-null counts per column.
bank_df.info()
Checking for null values
# Count NaNs per column.
bank_df.isna().sum()
Checking for missing values
# Look for '?' placeholder values.  NOTE(review): this sums the matching cells
# rather than counting them -- (bank_df == '?').sum() would give per-column counts.
bank_df[bank_df=='?'].sum()
We have 45211 records in total; a 5-point summary will help to indicate the outliers, if any, and the skewness of the data. We may further need to analyse using different exploratory analyses. marital - almost half of the data belongs to one category and the rest to the other 2. education - almost half of the data belongs to one category and the rest to the other 3. default - among the 2 categories, most of the data belongs to one, which shows skewness in the data. balance - the huge std in balance shows the skewness in the data. Similarly loan, contact, month, poutcome and Target are skewed; most records belong to one category.
# Five-point summary for every column, categoricals included.
bank_df.describe(include='all').transpose()
Different plots to check on the string and numeric data
# Boxplots for the numeric columns and countplots for the categorical ones,
# grouped a few panels per figure.
i = 0            # positional index into bank_df.dtypes, advanced once per column
cnt = 1          # panel slot within the current figure
subplt = 0       # figure counter, reused as the subplot grid's row count
col = ['b', 'g', 'r', 'y']
for columns in bank_df.columns:
    col_dtype = bank_df.dtypes[i]  # renamed from 'type' to avoid shadowing the builtin
    if cnt in (1, 5, 9, 13, 17):   # start a fresh figure every 4 plotted panels
        plt.figure(figsize=(26, 10))
        subplt += 1
        cnt = 1
    plt.subplot(subplt, 5, cnt)
    plt.title(columns)
    if col_dtype == 'int64':
        # numeric column -> boxplot (NOTE: positional data args were removed in seaborn >= 0.12)
        sns.boxplot(bank_df[columns], color=col[cnt - 1], orient='v')
    else:
        # object column -> frequency countplot
        sns.countplot(bank_df[columns])
    i += 1
    cnt += 1
Histogram to check on the distribution for numeric attributes
# Distribution plots (histogram + KDE) for the numeric columns only.
i = 0
cnt = 1
subplt = 0
col = ['b', 'g', 'r', 'y']
for columns in bank_df.columns:
    col_dtype = bank_df.dtypes[i]  # renamed from 'type' to avoid shadowing the builtin
    if cnt in (1, 5, 9, 13, 17):
        plt.figure(figsize=(26, 10))
        subplt += 1
        cnt = 1
    if col_dtype == 'int64':
        plt.subplot(subplt, 5, cnt)
        plt.title(columns)
        sns.distplot(bank_df[columns])  # NOTE: distplot is deprecated; histplot/displot in newer seaborn
        cnt += 1  # slot advances only when a panel was actually drawn
    i += 1
Checking the correlation
# Pairwise correlation of the numeric columns.
bank_df.corr()
Preparing data to train the model by converting the string variables to numeric attributes
# Ordinal codes for the string-valued survey columns ('unknown' -> 0 where present).
dict_replace_job = {
    'unknown': 0, 'student': 1, 'housemaid': 2, 'unemployed': 3,
    'entrepreneur': 4, 'self-employed': 5, 'services': 6, 'retired': 7,
    'admin.': 8, 'technician': 9, 'blue-collar': 10, 'management': 11,
}
dict_replace_marital = {'single': 1, 'married': 2, 'divorced': 3}
dict_replace_education = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
dict_replace_contact = {'unknown': 0, 'telephone': 1, 'cellular': 2}
# Month abbreviations -> calendar numbers, derived from one ordered list.
dict_replace_month = {m: n for n, m in enumerate(
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
     'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], start=1)}
dict_replace_poutcome = {'unknown': 0, 'other': 1, 'failure': 2, 'success': 3}
# Apply the ordinal mappings on a copy so bank_df keeps the raw strings.
bank_numeric_df=bank_df.copy()
bank_numeric_df['job']=bank_numeric_df['job'].replace(dict_replace_job)
bank_numeric_df['marital']=bank_numeric_df['marital'].replace(dict_replace_marital)
bank_numeric_df['education']=bank_numeric_df['education'].replace(dict_replace_education)
bank_numeric_df['contact']=bank_numeric_df['contact'].replace(dict_replace_contact)
bank_numeric_df['month']=bank_numeric_df['month'].replace(dict_replace_month)
bank_numeric_df['poutcome']=bank_numeric_df['poutcome'].replace(dict_replace_poutcome)
# Binary columns encoded via categorical codes (sorted categories; presumably
# 'no' -> 0 and 'yes' -> 1 -- confirm against the raw data).
bank_numeric_df['default']=pd.Series(pd.Categorical(np.array(bank_numeric_df['default'])).codes)
bank_numeric_df['housing']=pd.Series(pd.Categorical(np.array(bank_numeric_df['housing'])).codes)
bank_numeric_df['loan']=pd.Series(pd.Categorical(np.array(bank_numeric_df['loan'])).codes)
bank_numeric_df['Target']=pd.Series(pd.Categorical(np.array(bank_numeric_df['Target'])).codes)
bank_numeric_df.head()
bank_numeric_df.corr()
Heat map to check the correlation
# Correlation heatmap over all (now numeric) columns.
plt.figure(figsize=(26,15))
sns.heatmap(bank_numeric_df.corr(),vmin=-1,vmax=1)
Using info() to identify the missing values and datatypes of the source data. I have converted all the data to numeric attributes; if we had missing values like '?' or some characters, info() would return the datatype as object.
# Every column should now report a numeric dtype (object would mean stray strings remain).
bank_numeric_df.info()
The below are boxplots to check for outliers in the data. We have outliers on 'previous' that need to be cleaned. We have the value -1 in 'pdays' where no call was made previously, and all of those rows have 'poutcome' as unknown. 'pdays' and 'previous' impact 'poutcome', but they do not directly impact the current campaign target.
# Boxplot of every encoded column to eyeball outliers, a few panels per figure.
i = 0
cnt = 1
subplt = 0
col = ['b', 'g', 'r', 'y']
for columns in bank_numeric_df.columns:
    # (dropped an unused dtype lookup that also shadowed the builtin 'type')
    if cnt in (1, 5, 9, 13, 17):
        plt.figure(figsize=(26, 10))
        subplt += 1
        cnt = 1
    plt.subplot(subplt, 5, cnt)
    plt.title(columns)
    sns.boxplot(bank_numeric_df[columns], color=col[cnt - 1], orient='v')
    i += 1
    cnt += 1
The below are distribution plots to check the spread of each column. We have outliers on 'previous' that need to be cleaned. We have the value -1 in 'pdays' where no call was made previously, and all of those rows have 'poutcome' as unknown. 'pdays' and 'previous' impact 'poutcome', but they do not directly impact the current campaign target.
# Distribution plot per column of the fully numeric frame.
i = 0
cnt = 1
subplt = 0
col = ['b', 'g', 'r', 'y']  # unused here; kept for parity with the other plot cells
for columns in bank_numeric_df.columns:
    if cnt in (1, 5, 9, 13, 17):  # fresh figure every 4 panels
        plt.figure(figsize=(26, 10))
        subplt += 1
        cnt = 1
    plt.subplot(subplt, 5, cnt)
    plt.title(columns)
    sns.distplot(bank_numeric_df[columns])  # NOTE: deprecated in newer seaborn
    cnt += 1
    i += 1
The below are the box plot to find the attributes impact on the target variables. If there is no overlap it will directly influence the target prediction.
# Boxplot of each feature split by Target: little overlap between the two
# distributions means the feature discriminates the target well.
i = 0
cnt = 1
subplt = 0
col = ['b', 'g', 'r', 'y']
for columns in bank_numeric_df.columns:
    if cnt in (1, 5, 9, 13, 17):
        plt.figure(figsize=(26, 10))
        subplt += 1
        cnt = 1
    if columns != 'Target':  # never plot the target against itself
        plt.subplot(subplt, 5, cnt)
        plt.title('Target Vs ' + columns)
        sns.boxplot(bank_numeric_df['Target'], bank_numeric_df[columns], color=col[cnt - 1], orient='v')
        # NOTE(review): counters advance only for plotted columns; the exported
        # source lost its indentation, so confirm this matches the original intent.
        i += 1
        cnt += 1
Checking pairplot for correlation
# Pairwise scatter matrix of all columns; slow on 45k rows.
sns.pairplot(bank_numeric_df)
clearing the outlier
#To clear the outlier in previous
# NOTE(review): values >= 200 are replaced with round(mean * 100); the *100
# factor is undocumented and looks suspicious -- confirm it is intentional.
bank_numeric_df['previous']=[i if i < 200 else round(np.mean(bank_numeric_df['previous'])*100) for i in bank_numeric_df['previous'] ]
#To clear the outlier in balance
# Values >= 20000 are replaced with the column mean.
bank_numeric_df['balance']=[i if i < 20000 else np.mean(bank_numeric_df['balance']) for i in bank_numeric_df['balance'] ]
#To clear the outlier in campaign
# Values >= 15 are replaced with the column mean.
bank_numeric_df['campaign']=[i if i < 15 else np.mean(bank_numeric_df['campaign']) for i in bank_numeric_df['campaign'] ]
# Visual re-check of the cleaned columns.
sns.boxplot(bank_numeric_df['Target'],bank_numeric_df['previous'],orient='v')
sns.distplot(bank_numeric_df['balance'])
sns.distplot(bank_numeric_df['campaign'])
Overall EDA Summary
Except duration we don't have an individual variable that influences the target dependency (duration cannot be used as it will not be available at the time of the call). The number of previous contacts has an impact on the previous outcome; considering that, campaign (the number of contacts in this campaign) will also be an important feature for our target variable. Whoever has not opted for a term deposit has more chance to opt in this time compared to those who already opted. Housing has a mild negative correlation, which indicates that whoever has not opted for a housing loan has a small chance to opt for a term deposit. Whoever was contacted through cellular has a higher chance than with other contact modes. A lot of contacts happened in the month of May; a split based on month may be a factor in segregating the data along with other features. After a certain point of balance we have less chance of opting for a term deposit. Below are the important features identified using the EDA: poutcome, campaign, housing, month, contact, balance.
# Features/target split; 'duration' is dropped because it is unknown before a call is made.
x=bank_numeric_df[ [i for i in bank_numeric_df.columns if i not in ('Target','duration')]]
y=bank_numeric_df['Target']
# 70/30 split with a fixed seed for reproducibility.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1)
# Long-format accumulator: one row per (metric, model) pair.
Final_score_cmp=pd.DataFrame(columns=['Score_type','Score_value','Score_model'])
class models():
    """Thin wrapper around one sklearn classifier per call.

    Each *function method assigns self.model, fits it on the training split
    and stores predictions for the test split in self.ytest_predicted.  The
    ytest parameter is accepted for a uniform call signature but is only
    consumed later via scores().
    """

    def __init__(self, name):
        self.name = name  # label used when printing/recording scores

    def _fit_predict(self, xtrain, xtest, ytrain):
        # Shared fit/predict step used by every model method below.
        self.model.fit(xtrain, ytrain)
        self.ytest_predicted = self.model.predict(xtest)

    def linearfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = LogisticRegression()
        self._fit_predict(xtrain, xtest, ytrain)

    def knnfunction(self, n, xtrain, xtest, ytrain, ytest):
        self.model = KNeighborsClassifier(n_neighbors=n)
        self._fit_predict(xtrain, xtest, ytrain)

    def NBfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = GaussianNB()
        self._fit_predict(xtrain, xtest, ytrain)

    def SVMfunction(self, c, g, xtrain, xtest, ytrain, ytest):
        self.model = SVC(C=c, gamma=g)
        self._fit_predict(xtrain, xtest, ytrain)

    def Decisiontreefunction(self, xtrain, xtest, ytrain, ytest):
        self.model = DecisionTreeClassifier(random_state=1)
        self._fit_predict(xtrain, xtest, ytrain)

    def PrunedDecisiontreefunction(self, m, xtrain, xtest, ytrain, ytest):
        # m: max_depth used to prune the tree.
        self.model = DecisionTreeClassifier(max_depth=m, random_state=1)
        self._fit_predict(xtrain, xtest, ytrain)

    def Baggingfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = BaggingClassifier(random_state=1, n_estimators=50, max_samples=0.8, oob_score=True)
        self._fit_predict(xtrain, xtest, ytrain)

    def AdaBoostfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = AdaBoostClassifier(random_state=1, n_estimators=50)
        self._fit_predict(xtrain, xtest, ytrain)

    def GradientBoostfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = GradientBoostingClassifier(random_state=1, n_estimators=50)
        self._fit_predict(xtrain, xtest, ytrain)

    def Randomforestfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = RandomForestClassifier(random_state=1, n_estimators=50, oob_score=True, max_features=5)
        self._fit_predict(xtrain, xtest, ytrain)

    def scores(self, modelname, ytest, ytest_predicted):
        """Print and return (accuracy, recall, precision, f1) for the predictions."""
        accuracy, recall, precision, f1 = accuracy_score(ytest, ytest_predicted), recall_score(ytest, ytest_predicted), precision_score(ytest, ytest_predicted), f1_score(ytest, ytest_predicted)
        print('Accuracy_score for', modelname, 'is ', accuracy)  # fixed 'Acccuracy' typo
        print('Recall_score for', modelname, 'is ', recall)
        print('Precision_score for', modelname, 'is ', precision)
        print('F1_score for', modelname, 'is ', f1)
        print('\n')
        return accuracy, recall, precision, f1
class ensemblemodels():
    """Variant of `models` whose ensemble methods expose the key hyper-parameters.

    Same fit/predict/score contract as `models`: each *function method sets
    self.model and self.ytest_predicted; scores() prints and returns metrics.
    """

    def __init__(self, name):
        self.name = name  # label used when printing/recording scores

    def _fit_predict(self, xtrain, xtest, ytrain):
        # Shared fit/predict step used by every model method below.
        self.model.fit(xtrain, ytrain)
        self.ytest_predicted = self.model.predict(xtest)

    def knnfunction(self, n, xtrain, xtest, ytrain, ytest):
        self.model = KNeighborsClassifier(n_neighbors=n)
        self._fit_predict(xtrain, xtest, ytrain)

    def NBfunction(self, xtrain, xtest, ytrain, ytest):
        self.model = GaussianNB()
        self._fit_predict(xtrain, xtest, ytrain)

    def PrunedDecisiontreefunction(self, m, xtrain, xtest, ytrain, ytest):
        # m: max_depth used to prune the tree.
        self.model = DecisionTreeClassifier(max_depth=m, random_state=1)
        self._fit_predict(xtrain, xtest, ytrain)

    def Baggingfunction(self, base, n_estimator, xtrain, xtest, ytrain, ytest):
        # base: estimator to bag (NOTE: base_estimator was renamed to
        # 'estimator' in sklearn >= 1.2 -- confirm the installed version).
        self.model = BaggingClassifier(random_state=1, base_estimator=base, n_estimators=n_estimator)
        self._fit_predict(xtrain, xtest, ytrain)

    def Randomforestfunction(self, n_estimator, mf, md, xtrain, xtest, ytrain, ytest):
        # mf: max_features per split; md: max_depth per tree.
        self.model = RandomForestClassifier(random_state=1, n_estimators=n_estimator, oob_score=True, max_features=mf, max_depth=md)
        self._fit_predict(xtrain, xtest, ytrain)

    def scores(self, modelname, ytest, ytest_predicted):
        """Print and return (accuracy, recall, precision, f1) for the predictions."""
        accuracy, recall, precision, f1 = accuracy_score(ytest, ytest_predicted), recall_score(ytest, ytest_predicted), precision_score(ytest, ytest_predicted), f1_score(ytest, ytest_predicted)
        print('Accuracy_score for', modelname, 'is ', accuracy)  # fixed 'Acccuracy' typo
        print('Recall_score for', modelname, 'is ', recall)
        print('Precision_score for', modelname, 'is ', precision)
        print('F1_score for', modelname, 'is ', f1)
        print('\n')
        return accuracy, recall, precision, f1
Evaluating model with all features
# Fit/score each classifier on the all-feature split and record its metrics.
# NOTE(review): DataFrame.append was removed in pandas 2.0 --
# Final_score_cmp = pd.concat([Final_score_cmp, score_df]) is the replacement.
#Logistic regression with 0.5 probability
mname='Logic_clf'
Logistic_model=models(mname)
Logistic_model.linearfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Logistic_model.scores(Logistic_model.name,ytest,Logistic_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
# KNN swept over n in {3, 5, 7, 10}; only n=7 is recorded in the comparison frame.
#KNN classifier
n=3
mname='KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
#KNN classifier
n=5
mname='KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
#KNN classifier
n=7
mname='KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
#KNN classifier
n=10
mname='KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
# NOTE: appends the n=7 score_df (it was not rebuilt for the n=10 run above).
Final_score_cmp=Final_score_cmp.append(score_df)
#Naive Bayes
mname='NB_classifier'
NB_model=models(mname)
NB_model.NBfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_model.scores(NB_model.name,ytest,NB_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Decision tree
mname='DT_classifier'
DT_model=models(mname)
DT_model.Decisiontreefunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=DT_model.scores(DT_model.name,ytest,DT_model.ytest_predicted)
# Pruned decision tree (max_depth=2); only the pruned scores are recorded.
m=2
mname='PDT_classifier'
PDT_model=models(mname)
PDT_model.PrunedDecisiontreefunction(m,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=PDT_model.scores(PDT_model.name,ytest,PDT_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging
mname='Bagging_classifier'
Bagging_model=models(mname)
Bagging_model.Baggingfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Bagging_model.scores(Bagging_model.name,ytest,Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#AdaBoosting
mname='AdaBoosting_classifier'
Adaboost_model=models(mname)
Adaboost_model.AdaBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Adaboost_model.scores(Adaboost_model.name,ytest,Adaboost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging with NB
mname='Feature balanced Bagging_classifier NB'
NB_Bagging_model=ensemblemodels(mname)
NB_Bagging_model.Baggingfunction(NB_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_Bagging_model.scores(NB_Bagging_model.name,ytest,NB_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#GradientBoosting
mname='GradientBoosting_classifier'
GradientBoost_model=models(mname)
GradientBoost_model.GradientBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=GradientBoost_model.scores(GradientBoost_model.name,ytest,GradientBoost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Randomforest
mname='Randomforest_classifier'
Randomforest_model=models(mname)
Randomforest_model.Randomforestfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Randomforest_model.scores(Randomforest_model.name,ytest,Randomforest_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#SVM
c=.1
g=0.01
mname='SVM_classifier of c & gamma - ' + str(c) + ' & ' + str(g)
SVM_model=models(mname)
SVM_model.SVMfunction(c,g,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=SVM_model.scores(SVM_model.name,ytest,SVM_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
# Hyper-parameter search for SVM (C, gamma) over the FULL data set -- note this
# fits on x,y, not just the training split.
grid_model=GridSearchCV(estimator=SVM_model.model,param_grid={'C':[0.1,10,100,1000],'gamma':[0.01,0.1,10,100]},cv=2)
grid_model.fit(x,y)
grid_model.best_params_
grid_model.best_score_
# Hyper-parameter search for the decision tree depth, again on the full data.
grid_model=GridSearchCV(estimator=DT_model.model,param_grid={'max_depth':[1,2,3,4,5,6,7,8,9,10]},cv=5)
grid_model.fit(x,y)
grid_model.best_params_
grid_model.best_score_
Final_score_cmp
def compare_score(x, y, h):
    """Grouped barplot of Score_value (y) by Score_type (x), one hue per model (h)."""
    plt.figure(figsize=(30, 15))
    plt.title('Score Comparision across models')
    # Keyword arguments: seaborn >= 0.12 no longer accepts positional data vectors.
    sns.barplot(x=x, y=y, hue=h)
# Visual comparison of every model recorded so far.
compare_score(Final_score_cmp['Score_type'],Final_score_cmp['Score_value'],Final_score_cmp['Score_model'])
Evaluating model after balancing the training data
# Oversample the minority class with SMOTE -- training split only; the test
# split keeps the original class ratio.
imb_model=SMOTE()
xtrain,ytrain=imb_model.fit_resample(xtrain,ytrain)
Counter(ytrain)
# Repeat the model sweep on the SMOTE-balanced training data; model names get
# an 'imb ' prefix so both runs coexist in Final_score_cmp.
# NOTE(review): DataFrame.append was removed in pandas 2.0 -- use pd.concat.
#Logistic regression with 0.5 probability
mname='imb Logic_clf'
Logistic_model=models(mname)
Logistic_model.linearfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Logistic_model.scores(Logistic_model.name,ytest,Logistic_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
# KNN swept over n in {3, 5, 7, 10}; only n=7 is recorded.
#KNN classifier
n=3
mname='imb KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
#KNN classifier
n=5
mname='imb KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
#KNN classifier
n=7
mname='imb KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
#KNN classifier
n=10
mname='imb KNN_classifier '+str(n)+' neighbour'
KNN_model=models(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
# NOTE: appends the n=7 score_df (not rebuilt for the n=10 run above).
Final_score_cmp=Final_score_cmp.append(score_df)
#Naive Bayes
mname='imb NB_classifier'
NB_model=models(mname)
NB_model.NBfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_model.scores(NB_model.name,ytest,NB_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Decision tree
mname='imb DT_classifier'
DT_model=models(mname)
DT_model.Decisiontreefunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=DT_model.scores(DT_model.name,ytest,DT_model.ytest_predicted)
# Pruned decision tree (max_depth=2); only the pruned scores are recorded.
m=2
mname='imb PDT_classifier'
PDT_model=models(mname)
PDT_model.PrunedDecisiontreefunction(m,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=PDT_model.scores(PDT_model.name,ytest,PDT_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging
mname='imb Bagging_classifier'
Bagging_model=models(mname)
Bagging_model.Baggingfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Bagging_model.scores(Bagging_model.name,ytest,Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging with NB
mname='imb balanced Bagging_classifier NB'
NB_Bagging_model=ensemblemodels(mname)
NB_Bagging_model.Baggingfunction(NB_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_Bagging_model.scores(NB_Bagging_model.name,ytest,NB_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#AdaBoosting
mname='imb AdaBoosting_classifier'
Adaboost_model=models(mname)
Adaboost_model.AdaBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Adaboost_model.scores(Adaboost_model.name,ytest,Adaboost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#GradientBoosting
mname='imb GradientBoosting_classifier'
GradientBoost_model=models(mname)
GradientBoost_model.GradientBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=GradientBoost_model.scores(GradientBoost_model.name,ytest,GradientBoost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Randomforest
mname='imb Randomforest_classifier'
Randomforest_model=models(mname)
Randomforest_model.Randomforestfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Randomforest_model.scores(Randomforest_model.name,ytest,Randomforest_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#SVM
c=.1
g=0.01
mname='imb SVM_classifier of c & gamma - ' + str(c) + ' & ' + str(g)
SVM_model=models(mname)
SVM_model.SVMfunction(c,g,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=SVM_model.scores(SVM_model.name,ytest,SVM_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
Tree visualization
# Export the full and pruned decision trees to Graphviz and render them as PNGs.
from IPython.display import display  # 'display' is a notebook builtin; make it explicit for plain scripts

# Full decision tree.  class_names must name the two target classes;
# the original list('target') wrongly produced six single-character names.
with open('credit_tree.dot', 'w') as Credit_Tree_File:
    dot_data = tree.export_graphviz(DT_model.model, out_file=Credit_Tree_File, feature_names=x.columns, class_names=['0', '1'])
retCode = system("dot -Tpng credit_tree.dot -o credit_tree.png")  # requires Graphviz on PATH
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    display(Image("credit_tree.png"))

# Pruned decision tree (same output file, overwritten).
with open('credit_tree.dot', 'w') as Credit_Tree_File:
    dot_data = tree.export_graphviz(PDT_model.model, out_file=Credit_Tree_File, feature_names=x.columns, class_names=['0', '1'])
retCode = system("dot -Tpng credit_tree.dot -o credit_tree.png")
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    display(Image("credit_tree.png"))
Getting the feature importance from decision tree
# Feature importances from the full decision tree, highest first.
keys=list(x.columns)
values=list(DT_model.model.feature_importances_)
feature_importance_df=pd.DataFrame({'columns':keys,'feature_imp':values})
feature_importance_df.sort_values(by='feature_imp',ascending=False)
# Same for the pruned tree (overwrites the frame above).
keys=list(x.columns)
values=list(PDT_model.model.feature_importances_)
feature_importance_df=pd.DataFrame({'columns':keys,'feature_imp':values})
feature_importance_df.sort_values(by='feature_imp',ascending=False)
Feature selection based on the EDA and feature importance from decision tree
# Rebuild the split with only the features picked from EDA + tree importances,
# reset the score accumulator, and rebalance the new training split.
x=bank_numeric_df[ ['housing','poutcome','month','contact','balance']]
y=bank_numeric_df['Target']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1)
Final_score_cmp=pd.DataFrame(columns=['Score_type','Score_value','Score_model'])
imb_model=SMOTE()
xtrain,ytrain=imb_model.fit_resample(xtrain,ytrain)
Counter(ytrain)
# Model sweep on the 5-feature, SMOTE-balanced data ('Feature balanced' prefix).
# NOTE(review): DataFrame.append was removed in pandas 2.0 -- use pd.concat.
#KNN classifier
n=5
mname='Feature balanced KNN_classifier '+str(n)+' neighbour'
KNN_model=ensemblemodels(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
# NOTE: this KNN score_df is overwritten by the PDT block below before any append.
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
# Pruned decision tree (max_depth=3).
m=3
mname='Feature balanced PDT_classifier'
PDT_model=ensemblemodels(mname)
PDT_model.PrunedDecisiontreefunction(m,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=PDT_model.scores(PDT_model.name,ytest,PDT_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Naive Bayes
mname='Feature balanced NB_classifier'
NB_model=models(mname)
NB_model.NBfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_model.scores(NB_model.name,ytest,NB_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging with NB
mname='Feature balanced balanced Bagging_classifier NB'
NB_Bagging_model=ensemblemodels(mname)
NB_Bagging_model.Baggingfunction(NB_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_Bagging_model.scores(NB_Bagging_model.name,ytest,NB_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging with KNN
mname='Feature balanced Bagging_classifier KNN'
KNN_Bagging_model=ensemblemodels(mname)
KNN_Bagging_model.Baggingfunction(KNN_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_Bagging_model.scores(KNN_Bagging_model.name,ytest,KNN_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Bagging
mname='Feature balanced Bagging_classifier'
Bagging_model=models(mname)
Bagging_model.Baggingfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Bagging_model.scores(Bagging_model.name,ytest,Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#AdaBoosting
mname='Feature balanced AdaBoosting_classifier'
Adaboost_model=models(mname)
Adaboost_model.AdaBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Adaboost_model.scores(Adaboost_model.name,ytest,Adaboost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#GradientBoosting
mname='Feature balanced GradientBoosting_classifier'
GradientBoost_model=models(mname)
GradientBoost_model.GradientBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=GradientBoost_model.scores(GradientBoost_model.name,ytest,GradientBoost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Randomforest
mname='Feature balanced Randomforest_classifier'
Randomforest_model=ensemblemodels(mname)
Randomforest_model.Randomforestfunction(50,2,1,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Randomforest_model.scores(Randomforest_model.name,ytest,Randomforest_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
# Compare only the feature-selected runs.
compare_score(Final_score_cmp[Final_score_cmp['Score_model'].str.contains('Feature balanced')]['Score_type'],Final_score_cmp[Final_score_cmp['Score_model'].str.contains('Feature balanced')]['Score_value'],Final_score_cmp[Final_score_cmp['Score_model'].str.contains('Feature balanced')]['Score_model'])
# Repeat the experiment with an even smaller, two-feature set.
x=bank_numeric_df[ ['housing','poutcome']]
y=bank_numeric_df['Target']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1)
Final_score_cmp=pd.DataFrame(columns=['Score_type','Score_value','Score_model'])
imb_model=SMOTE()
xtrain,ytrain=imb_model.fit_resample(xtrain,ytrain)
Counter(ytrain)
#KNN classifier
n=5
mname='Feature balanced KNN_classifier '+str(n)+' neighbour'
KNN_model=ensemblemodels(mname)
KNN_model.knnfunction(n,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_model.scores(KNN_model.name,ytest,KNN_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
m=3
mname='Feature balanced PDT_classifier'
PDT_model=ensemblemodels(mname)
PDT_model.PrunedDecisiontreefunction(m,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=PDT_model.scores(PDT_model.name,ytest,PDT_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname]
,['Precision',precision,mname],['F1',f1,mname]],columns=['Score_type','Score_value','Score_model'])
Final_score_cmp=Final_score_cmp.append(score_df)
#Naive Bayes on the balanced two-feature data.
mname='Feature balanced NB_classifier'
NB_model=models(mname)
NB_model.NBfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_model.scores(NB_model.name,ytest,NB_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#Bagging (50 estimators) with the fitted Naive Bayes model as the base estimator.
# Label fix: the model name had a duplicated word ('balanced balanced'), which
# mislabelled this model in the score table and comparison plot.
mname='Feature balanced Bagging_classifier NB'
NB_Bagging_model=ensemblemodels(mname)
NB_Bagging_model.Baggingfunction(NB_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=NB_Bagging_model.scores(NB_Bagging_model.name,ytest,NB_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#Bagging (50 estimators) with the fitted KNN model as the base estimator.
mname='Feature balanced Bagging_classifier KNN'
KNN_Bagging_model=ensemblemodels(mname)
KNN_Bagging_model.Baggingfunction(KNN_model.model,50,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=KNN_Bagging_model.scores(KNN_Bagging_model.name,ytest,KNN_Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#Bagging with the default base estimator on the balanced two-feature data.
mname='Feature balanced Bagging_classifier'
Bagging_model=models(mname)
Bagging_model.Baggingfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Bagging_model.scores(Bagging_model.name,ytest,Bagging_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#AdaBoost on the balanced two-feature data.
mname='Feature balanced AdaBoosting_classifier'
Adaboost_model=models(mname)
Adaboost_model.AdaBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Adaboost_model.scores(Adaboost_model.name,ytest,Adaboost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#Gradient boosting on the balanced two-feature data.
mname='Feature balanced GradientBoosting_classifier'
GradientBoost_model=models(mname)
GradientBoost_model.GradientBoostfunction(xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=GradientBoost_model.scores(GradientBoost_model.name,ytest,GradientBoost_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
#Random forest on the balanced two-feature data: 50 estimators,
# min_samples_split=2, min_samples_leaf=1.
mname='Feature balanced Randomforest_classifier'
Randomforest_model=ensemblemodels(mname)
Randomforest_model.Randomforestfunction(50,2,1,xtrain,xtest,ytrain,ytest)
accuracy,recall,precision,f1=Randomforest_model.scores(Randomforest_model.name,ytest,Randomforest_model.ytest_predicted)
score_df=pd.DataFrame([['Accuracy',accuracy,mname],['Recall',recall,mname],
                       ['Precision',precision,mname],['F1',f1,mname]],
                      columns=['Score_type','Score_value','Score_model'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
Final_score_cmp=pd.concat([Final_score_cmp,score_df],ignore_index=True)
# Filter the 'Feature balanced' rows once instead of repeating the same
# str.contains scan three times.
fb_scores=Final_score_cmp[Final_score_cmp['Score_model'].str.contains('Feature balanced')]
compare_score(fb_scores['Score_type'],fb_scores['Score_value'],fb_scores['Score_model'])
Conclusion: We are trying to predict whether a person will opt for a term deposit or not. If the person opts for a term deposit, it is a revenue gain for the company. A few false positives (predicting a person will opt in when they will not) cost only some extra follow-up effort, but a false negative (predicting a person will not opt in when they actually would) is a lost opportunity and a real loss to the company. So we have to identify the model with good Recall and acceptable Precision, which will be the best fit for this situation.
We first evaluated the models with all the features and measured their performance. Then we balanced the data with the same features and repeated the evaluation. Next we used only the important features identified from EDA and feature importance and evaluated the performance. Finally, we balanced the data with the important features and evaluated the models.
Based on the above metrics shown,
Before feature selection: all features, with the class imbalance treated using imblearn
Logistic regression is having good recall. NB and bagging with NB is having good recall. Pruned DT is having good recall. SVM is having good recall but consuming more time.
After feature selection - ['housing','poutcome','month','contact','balance','marital']
NB and bagging with NB is having good recall. Random forest is having good recall but no consistency because of randomness.
After Feature selection ['housing','poutcome']
Logistic Regression, Pruned DT, Adaboosting, Gradient boosting and Bagging have good recall.
hence these algorithms will be best suited for this situation.